{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import cv2\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from IPython.display import *\n",
    "from collections import Counter\n",
    "import seaborn as sns\n",
    "import tqdm\n",
    "import pandas as pd\n",
    "\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format = 'retina'\n",
    "IMAGE_DIR = 'image_contest_level_2'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 读取数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('image_contest_level_2/labels.txt', sep=' ', header=None)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 统计字符串长度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "lens = np.array(map(lambda x:len(x.split(';')[1]), df[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lens.min(), lens.max()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 统计出现次数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = Counter(''.join([x.decode('utf-8') for x in df[0]]))\n",
    "d = pd.DataFrame(c.most_common(), columns=['word', 'count'])\n",
    "d.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 画柱状图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(16, 9))\n",
    "sns.barplot(d['word'], d['count'], palette=\"Greens_d\")\n",
    "print ' '.join(d['word'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 计算出现频率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n = len(df)\n",
    "for i in d.index:\n",
    "    print d['word'][i], d['count'][i] / float(n)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 猜括号生成方式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "1+1+1+1\n",
    "\n",
    "(1+1)+1+1\n",
    "1+(1+1)+1\n",
    "1+1+(1+1)\n",
    "(1+1+1)+1\n",
    "1+(1+1+1)\n",
    "\n",
    "((1+1)+1)+1\n",
    "(1+(1+1))+1\n",
    "\n",
    "1+((1+1)+1)\n",
    "1+(1+(1+1))\n",
    "\n",
    "(1+1)+(1+1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "2*5/11.0+5/11.0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 画出运算符相关系数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {}\n",
    "for c in '=+-*(':\n",
    "    data[c] = [x.count(c) for x in df[0]]\n",
    "\n",
    "df2 = pd.DataFrame(data)\n",
    "df2.corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.heatmap(df2.corr(), annot=True, linewidths=.5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 猜测生成方式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test = np.random.randint(3, size=(10000, 3))\n",
    "test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {}\n",
    "for i in range(3):\n",
    "    data[i] = np.count_nonzero(test == i, axis=-1)\n",
    "\n",
    "df2 = pd.DataFrame(data)\n",
    "df2.corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.heatmap(df2.corr(), annot=True, linewidths=.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
